Source Code of org.terrier.structures.indexing.LexiconBuilder

/*
* Terrier - Terabyte Retriever
* Webpage: http://terrier.org
* Contact: terrier{a.}dcs.gla.ac.uk
* University of Glasgow - School of Computing Science
* http://www.gla.ac.uk/
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is LexiconBuilder.java.
*
* The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
* All Rights Reserved.
*
* Contributor(s):
*   Craig Macdonald <craigm{a.}dcs.gla.ac.uk> (original author)
*   Vassilis Plachouras <vassilis{a.}dcs.gla.ac.uk>
*/
package org.terrier.structures.indexing;
import java.io.Closeable;
import java.io.IOException;
import java.util.Arrays;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.PriorityQueue;

import org.apache.hadoop.io.Text;
import org.apache.log4j.Logger;

import org.terrier.structures.FSOMapFileLexicon;
import org.terrier.structures.FSOMapFileLexiconOutputStream;
import org.terrier.structures.FieldLexiconEntry;
import org.terrier.structures.Index;
import org.terrier.structures.LexiconEntry;
import org.terrier.structures.LexiconOutputStream;
import org.terrier.structures.seralization.FixedSizeWriteableFactory;
import org.terrier.utility.ApplicationSetup;
/**
 * Builds temporary lexicons while indexing a collection and
 * merges them once the indexing of the collection has finished.
 * @author Craig Macdonald &amp; Vassilis Plachouras
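 * <p>A minimal usage sketch (the <tt>index</tt> and <tt>documents</tt>
 * variables are hypothetical, not part of this class):
 * <pre>
 * LexiconBuilder lb = new LexiconBuilder(index, "lexicon");
 * for (DocumentPostingList doc : documents)
 *     lb.addDocumentTerms(doc);
 * lb.finishedDirectIndexBuild();
 * // ... build the inverted index ...
 * lb.finishedInvertedIndexBuild();
 * </pre>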
  */
public class LexiconBuilder
{

  /** Class to be used as the LexiconOutputStream. Set by this and child classes */
  @SuppressWarnings("unchecked") //TODO : this is complicated to fix
  protected Class<? extends LexiconOutputStream> lexiconOutputStream = null;

  //protected Class<? extends LexiconMap> LexiconMapClass = null;
 
  protected final String lexiconEntryFactoryValueClass;
 
  /** The logger used for this class */
  protected static final Logger logger = Logger.getLogger(LexiconBuilder.class);
 
  /** How many documents have been processed so far.*/
  protected int DocCount = 0;

  /** How many terms are in the final lexicon */
  protected int TermCount = 0;
 
  /** The number of documents for which a temporary lexicon is created.
   * Corresponds to property <tt>bundle.size</tt>, default value 2000. */
  protected static final int DocumentsPerLexicon = Integer.parseInt(ApplicationSetup.getProperty("bundle.size", "2000"));
  /** The linked list in which the temporary lexicon structure names are stored.
    * These are merged into a single lexicon by the merge() method.
    * LinkedList is the best List implementation here, as the only
    * operations are appending an element and removing the first
    * element. */
  protected final LinkedList<String> tempLexFiles = new LinkedList<String>();
 
  /** The lexicontree to write the current term stream to */
  protected LexiconMap TempLex;
 
  /** The directory to write the final lexicons to */
  protected String indexPath = null;
  /** The filename of the lexicons. */
  protected String indexPrefix = null;
 
  protected Index index = null;
 
  /** How many temporary lexicons have been generated so far */
  protected int TempLexCount = 0;
 
  /** Should we only merge lexicons in pairs (Terrier 1.0.x scheme)? Set by property <tt>lexicon.builder.merge.2lex.attime</tt> */
  protected static final boolean MERGE2LEXATTIME = Boolean.parseBoolean(ApplicationSetup.getProperty("lexicon.builder.merge.2lex.attime", "false"));

  /** Number of lexicons to merge at once. Set by property <tt>lexicon.builder.merge.lex.max</tt>, defaults to 16 */
  protected static final int MAXLEXMERGE = Integer.parseInt(ApplicationSetup.getProperty("lexicon.builder.merge.lex.max", "16"));
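  /* Worked example (hypothetical run): with 40 temporary lexicons and the
   * default maximum of 16, merge() below proceeds in batches of 16, 16 and
   * finally 10 (the 8 remaining inputs plus the 2 intermediate merges),
   * producing the final lexicon. */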
  /**
   * Counter of collection statistics over LexiconEntry objects
   */
  public interface CollectionStatisticsCounter extends Closeable
  {
    /**
     * Accumulates the statistics of the given entry into the count
     */
    void count(LexiconEntry value);
  }
 
  /** counts global statistics in the fields case */
  protected static class FieldLexiconCollectionStaticticsCounter
    implements CollectionStatisticsCounter
  {
    long numberOfTokens = 0;
    int numberOfTerms = 0;
    long numberOfPointers = 0;
    final Index index;
    int numFields;
    final long[] tokensF;
   
    public FieldLexiconCollectionStaticticsCounter(Index _index, int _numFields)
    {
      index = _index;
      numFields = _numFields;
      tokensF = new long[numFields];
    }
   
    public void count(LexiconEntry value)
    {
      numberOfTokens += value.getFrequency();
      numberOfPointers += value.getDocumentFrequency();
      numberOfTerms++;
      int[] fieldFreqs = ((FieldLexiconEntry)value).getFieldFrequencies();
      for(int fi = 0; fi < numFields; fi++)
      {
        tokensF[fi] += (long)fieldFreqs[fi];
      }
    }
 
    public void close()
    {
      if (index != null)
      {
        index.setIndexProperty("num.Terms", ""+numberOfTerms);
        index.setIndexProperty("num.Tokens", ""+numberOfTokens);
        index.setIndexProperty("num.Pointers", ""+numberOfPointers);
        for(int fi = 0; fi < numFields; fi++)
        {
          index.setIndexProperty("num.field."+fi+".Tokens", ""+ tokensF[fi]);
        }
      }
    }
  }
 
  protected static class NullCollectionStatisticsCounter implements CollectionStatisticsCounter
  {

    public void count(LexiconEntry value) {
     
    }

    public void close() throws IOException {
     
    }
   
  }
 
  /** counts global statistics in the non-fields case */
  public static class BasicLexiconCollectionStaticticsCounter
    implements CollectionStatisticsCounter
  {
    String midfix = "";
    long numberOfTokens = 0;
    int numberOfTerms = 0;
    long numberOfPointers = 0;
    final Index index;
    /**
     * constructor
     * @param _index
     */
    public BasicLexiconCollectionStaticticsCounter(Index _index)
    {
      index = _index;
    }
    /**
     * constructor
     * @param _index
     * @param subset_name
     */
    public BasicLexiconCollectionStaticticsCounter(Index _index, String subset_name)
    {
      index = _index;
      midfix = '.'+subset_name;
    }
       
    /**
     * {@inheritDoc}
     */
    public void count(LexiconEntry value)
    {
      numberOfTokens += value.getFrequency();
      numberOfPointers += value.getDocumentFrequency();
      numberOfTerms++;
    }
    /**
     * {@inheritDoc}
     */
    public void close()
    {
      if (index != null)
      {
        index.setIndexProperty("num"+midfix+".Terms", ""+numberOfTerms);
        index.setIndexProperty("num"+midfix+".Tokens", ""+numberOfTokens);
        index.setIndexProperty("num"+midfix+".Pointers", ""+numberOfPointers);
      }
    }
  }
 
  protected static LexiconMap instantiate(Class<? extends LexiconMap> LexiconMapClass)
  {
    LexiconMap TempLex = null;
    try{ TempLex = (LexiconMap) LexiconMapClass.newInstance(); } catch (Exception e) {logger.error(e);}
    return TempLex;
  }
 
  protected String defaultStructureName;
  protected FixedSizeWriteableFactory<LexiconEntry> valueFactory;
 
  /**
   * constructor
   * @param i
   * @param _structureName
   */
  public LexiconBuilder(Index i, String _structureName) {
    this(i, _structureName,
        instantiate(LexiconMap.class), "org.terrier.structures.BasicLexiconEntry");
  }
  /**
   * constructor
   * @param i
   * @param _structureName
   * @param _LexiconMapClass
   * @param _lexiconEntryClass
   */
  public LexiconBuilder(Index i, String _structureName,
      Class <? extends LexiconMap> _LexiconMapClass,
      String _lexiconEntryClass)
  {
    this(i, _structureName, instantiate(_LexiconMapClass), _lexiconEntryClass);
  }
  /**
   * constructor
   * @param i
   * @param _structureName
   * @param lexiconMap
   * @param _lexiconEntryClass
   */
  @SuppressWarnings("unchecked")
  public LexiconBuilder(Index i, String _structureName,
        LexiconMap lexiconMap,
        String _lexiconEntryClass)
  {
    this.index = i;
    this.indexPath = index.getPath();
    this.indexPrefix = index.getPrefix();
    this.defaultStructureName = _structureName;
    this.TempLex = lexiconMap;
    //TemporaryLexiconDirectory = indexPath + ApplicationSetup.FILE_SEPARATOR + indexPrefix + "_";
    //LexiconMapClass = lexiconMap; 
    lexiconEntryFactoryValueClass = _lexiconEntryClass;
   
 
    this.index.addIndexStructure(
        defaultStructureName+"-keyfactory",
        "org.terrier.structures.seralization.FixedSizeTextFactory",
        "java.lang.String",
        "${max.term.length}"
        );
    if (this.index.getIndexProperty("max.term.length", null) == null)
      this.index.setIndexProperty("max.term.length", ApplicationSetup.getProperty("max.term.length", ""+20));
    this.index.addIndexStructure(defaultStructureName+"-valuefactory", lexiconEntryFactoryValueClass+"$Factory", "", "");
    valueFactory = (FixedSizeWriteableFactory<LexiconEntry>)this.index.getIndexStructure(defaultStructureName+"-valuefactory");
    lexiconOutputStream = LexiconOutputStream.class;
  }

  /** Returns the number of terms in the final lexicon. Only updated once finishedDirectIndexBuild() has executed. */
  public int getFinalNumberOfTerms()
  {
    return TermCount;
  }

  /** If the application code generated lexicons itself, use this method to add them to the merge list.
    * Otherwise, don't touch this method.
    * @param structureName Full path to a lexicon to merge
    * @deprecated */
  public void addTemporaryLexicon(String structureName) {
    tempLexFiles.addLast(structureName);
    //filename = ApplicationSetup.makeAbsolute(filename, TemporaryLexiconDirectory);
  }

  /** Writes the current contents of the TempLex in-memory lexicon down to
    * a temporary disk lexicon.
    */
  protected void writeTemporaryLexicon()
  {
    try{
      //TempLexDirCount = TempLexCount / TempLexPerDir;
      //if (! Files.exists(TemporaryLexiconDirectory + TempLexDirCount)) {
      //  String tmpDir = TemporaryLexiconDirectory + TempLexDirCount;
      //  Files.mkdir(tmpDir);
      //  Files.deleteOnExit(tmpDir);//it's fine to mark the temporary *directory* for deletion
      //}
      //String tmpLexName = TemporaryLexiconDirectory + TempLexDirCount + ApplicationSetup.FILE_SEPARATOR + TempLexCount;
      //LexiconOutputStream<String> los = getLexOutputStream(TempLexDirCount+""+TempLexCount);
      final String tmpLexName = this.defaultStructureName+"-tmp"+ TempLexCount;
      LexiconOutputStream<String> los = getLexOutputStream(tmpLexName);
      TempLex.storeToStream(los);
      los.close();
      /* An alternative but deprecated method to store the temporary lexicons is:
       * TempLex.storeToFile(tmpLexName); */
      //tempLexFiles.addLast(TempLexDirCount+""+TempLexCount);
      tempLexFiles.addLast(tmpLexName);
    }catch(IOException ioe){
      logger.error("Indexing failed to write a lexicon to disk : ", ioe);
    }   
  }

  /** Add a single term to the lexicon being built
    * @param term The String term
    * @param tf the frequency of the term */ 
  public void addTerm(String term, int tf)
  {
    TempLex.insert(term,tf);
  }

  /** Adds the terms of a document to the temporary lexicon in memory.
    * @param terms DocumentPostingList the terms of the document to add to the temporary lexicon */
  public void addDocumentTerms(DocumentPostingList terms)
  {
    TempLex.insert(terms);
    DocCount++;
    if((DocCount % DocumentsPerLexicon) == 0)
    {
      if (logger.isDebugEnabled())
        logger.debug("flushing lexicon");
      writeTemporaryLexicon();
      TempLexCount++;
      TempLex.clear();
      //try{ TempLex = (LexiconMap)LexiconMapClass.newInstance(); } catch (Exception e) {logger.error(e);}
    }
  }
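  /* With the default bundle.size of 2000, temporary lexicons named
   * <structureName>-tmp0, <structureName>-tmp1, ... are thus flushed to disk
   * after documents 2000, 4000, ... and merged by finishedDirectIndexBuild(). */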

  /** Force a temporary lexicon to be flushed */
  public void flush()
  {
    if (logger.isDebugEnabled())
      logger.debug("flushing lexicon");
    writeTemporaryLexicon();
    TempLexCount++;
    TempLex.clear();
  }
 
  /**
   * Post-processes the lexicon after the inverted index
   * has been created.
   */
  public void finishedInvertedIndexBuild() {
    optimiseLexicon();
  }
 
 
 
  /**
   * Post-processes the lexicon after the direct and
   * document indexes have been created.
   */
  public void finishedDirectIndexBuild()
  {
    if (logger.isDebugEnabled())
      logger.debug("flushing lexicon to disk after the direct index completed");
    //only write a temporary lexicon if there are any items in it
    if (TempLex.getNumberOfNodes() > 0)
      writeTemporaryLexicon();
    TempLex = null;

    //merges the temporary lexicons
    if (tempLexFiles.size() > 0)
    {
      //Set<String> tempDirectories = new HashSet<String>();
      //for(String tmpLex : tempLexFiles)
      //{
      //  tempDirectories.add(Files.getParent(tmpLex));
      //}
      try{
        merge(tempLexFiles);
       
        //creates the offsets and hash file
        optimiseLexicon();
      } catch(IOException ioe){
        logger.error("Indexing failed to merge temporary lexicons to disk : ", ioe);
      }
      //for (String tmpDir : tempDirectories)
      //{
      //  Files.delete(tmpDir);
      //}
   
    }
    //else
    //  logger.warn("No temporary lexicons to merge, skipping");
  }
 
  /**
   * Merges the intermediate lexicon files created during the indexing.
   * @param filesToMerge java.util.LinkedList containing the
   *    names of the temporary lexicons to merge.
   * @throws IOException if an input/output problem is
   *     encountered.
   */
  @SuppressWarnings("unchecked")
  public void merge(LinkedList<String> filesToMerge) throws IOException {
    //now the merging of the files in the filesToMerge list
    //must take place. Several strategies exist here:
    // a. number to merge is 0 - error condition, return immediately
    // b. number to merge is 1 - nothing to merge, just rename it
    // c. merge 2 at a time, in pairs (the default in Terrier 1.0.x)
    // d. merge N at once (N is a constant)
    // e. merge all at once.
    //The branches below implement (b)-(e), in that order.


    final int mergeNMaxLexicon = MAXLEXMERGE;
    final int StartFileCount = filesToMerge.size();
    //logger.info(StartFileCount+ " lexicons to merge");
    if (StartFileCount == 0)
    {
      //logger.warn("Tried to merge 0 lexicons. That's funny. Is everything ok?");
      return;
    }
    if (StartFileCount == 1)
    {
      FSOMapFileLexicon.renameMapFileLexicon(filesToMerge.removeFirst(), index.getPath(), index.getPrefix(),
          defaultStructureName, index.getPath(), index.getPrefix());
    }
    else if (MERGE2LEXATTIME)
    {
      //more than 1 lexicon to merge, but configured only to merge 2 at a time
      if (logger.isDebugEnabled())
        logger.debug("begin merging "+ StartFileCount +" temporary lexicons, in pairs...");
      long startTime = System.currentTimeMillis();
      int progressiveNumber = 0;
      String newMergedFile = null;
      while (filesToMerge.size() > 1) {
        String fileToMerge1 = (String) filesToMerge.removeFirst();
        String fileToMerge2 = (String) filesToMerge.removeFirst();
       
        //give the proper name to the final merged lexicon
        if (filesToMerge.size() == 0)
          newMergedFile = defaultStructureName;
        else
          newMergedFile = defaultStructureName + "-mergetmp"+ String.valueOf(progressiveNumber++);
 
        //The opening of the files needs to be broken into more steps, so
        //that all the open streams are closed after the operation
        //completes, and the intermediate files can eventually be deleted.

        Iterator<Map.Entry<String,LexiconEntry>> lis1 = getLexInputStream(fileToMerge1);
        Iterator<Map.Entry<String,LexiconEntry>> lis2 = getLexInputStream(fileToMerge2);
        LexiconOutputStream<String> los = getLexOutputStream(newMergedFile);
 
        if (logger.isDebugEnabled())
          logger.debug(
            "merging "
              + fileToMerge1
              + " with "
              + fileToMerge2
              + " to "
              + newMergedFile);
       
        mergeTwoLexicons(lis1, lis2, los);
     
        //delete the two files just merged
        FSOMapFileLexicon.deleteMapFileLexicon(fileToMerge1, indexPath, indexPrefix);
        FSOMapFileLexicon.deleteMapFileLexicon(fileToMerge2, indexPath, indexPrefix);
        filesToMerge.addLast(newMergedFile);
      }
      long endTime = System.currentTimeMillis();
      if (logger.isDebugEnabled())
        logger.debug("end of merging...("+((endTime-startTime)/1000.0D)+" seconds)");
    }
    else if (mergeNMaxLexicon > 0 && StartFileCount > mergeNMaxLexicon)
    {
      if (logger.isDebugEnabled())
        logger.debug("begin merging "+ StartFileCount +" files in batches of upto "+mergeNMaxLexicon+"...");
      long startTime = System.currentTimeMillis();
      int progressiveNumber = 0;
 

      while (filesToMerge.size() > 1)
      {
        final int numLexicons = Math.min(filesToMerge.size(), mergeNMaxLexicon);
        if (logger.isDebugEnabled())
           logger.debug("merging "+ numLexicons + " temporary lexicons");
        final String inputLexiconFileNames[] = new String[numLexicons];
        final Iterator<Map.Entry<String,LexiconEntry>>[] lis = (Iterator<Map.Entry<String,LexiconEntry>>[])new Iterator[numLexicons];

        for(int i=0;i<numLexicons;i++)
        {
          inputLexiconFileNames[i] =  filesToMerge.removeFirst();
          lis[i] = getLexInputStream(inputLexiconFileNames[i]);
        }

        String newMergedFile = null;
        //give the proper name to the final merged lexicon
        if (filesToMerge.size() == 0)
          newMergedFile = defaultStructureName;
        else
          newMergedFile = defaultStructureName + "-mergetmp"+ String.valueOf(progressiveNumber++);

        final LexiconOutputStream<String> los = getLexOutputStream(newMergedFile);
        mergeNLexicons(lis, los);
        for(String inputLexiconFileName : inputLexiconFileNames)
        {
          FSOMapFileLexicon.deleteMapFileLexicon(inputLexiconFileName, index.getPath(), index.getPrefix());
        }
        filesToMerge.addLast(newMergedFile);
      }
      long endTime = System.currentTimeMillis();
      if (logger.isDebugEnabled())
        logger.debug("end of merging...("+((endTime-startTime)/1000.0D)+" seconds)");
    } else {
      //merge all lexicons at once, regardless of how many exist
       if (logger.isDebugEnabled())
        logger.debug("begin merging "+ StartFileCount +" temporary lexicons at once...");
      long startTime = System.currentTimeMillis();
      final String inputLexiconFileNames[] = new String[StartFileCount];
      final Iterator<Map.Entry<String,LexiconEntry>>[] lis =
        (Iterator<Map.Entry<String,LexiconEntry>>[]) new Iterator[StartFileCount];
     
      for(int i=0;i<StartFileCount;i++)
      {
        inputLexiconFileNames[i] = filesToMerge.removeFirst();
        lis[i] = getLexInputStream(inputLexiconFileNames[i]);
        //logger.debug(i+" "+inputLexiconFileNames[i]);
      }
      final LexiconOutputStream<String> los = getLexOutputStream(defaultStructureName);
      mergeNLexicons(lis, los);
      for(int i=0;i<StartFileCount;i++)
      {
        FSOMapFileLexicon.deleteMapFileLexicon(inputLexiconFileNames[i], index.getPath(), index.getPrefix());
      }
      long endTime = System.currentTimeMillis();
      if (logger.isDebugEnabled())
        logger.debug("end of merging...("+((endTime-startTime)/1000.0D)+" seconds)");
    }
    FSOMapFileLexiconOutputStream.addLexiconToIndex(this.index, defaultStructureName, lexiconEntryFactoryValueClass+"$Factory");
  }
 
  protected LexiconEntry newLexiconEntry(int termid)
  {
    LexiconEntry rtr = valueFactory.newInstance();
    rtr.setTermId(termid);
    return rtr;
  }
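  /* mergeNLexicons() below performs a k-way merge of sorted term iterators:
   * the current term of every input is held in a PriorityQueue (so a term
   * occurring in several lexicons appears several times in the queue); the
   * smallest term is repeatedly polled, the matching input's LexiconEntry
   * is accumulated via LexiconEntry.add() and that input is advanced, and
   * the combined entry is written out once the head of the queue moves
   * past the term. */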
 
  @SuppressWarnings("unchecked")
  protected void mergeNLexicons(Iterator<Map.Entry<String,LexiconEntry>>[] lis, LexiconOutputStream<String> los) throws IOException
  {
    final int numLexicons = lis.length;
    boolean hasMore[] = new boolean[numLexicons];
    Map.Entry<String,LexiconEntry>[] currentEntries = new Map.Entry[numLexicons];
   
    Arrays.fill(hasMore, false);
    PriorityQueue<String> terms = new PriorityQueue<String>(numLexicons);
    for(int i=0;i<numLexicons;i++)
    {
      hasMore[i] = lis[i].hasNext();
      if (hasMore[i])
      {
        currentEntries[i] = lis[i].next();
        terms.add(currentEntries[i].getKey());
      }
      else
      {
        currentEntries[i] = null;
      }
       
    }
    String targetTerm= null;
    int targetTermId  = -1;
    LexiconEntry nextEntryToWrite = null;
    while(terms.size() > 0)
    {
      //what term are we working on
      targetTerm = terms.poll();
      //logger.debug("Current term is "+targetTerm + "length="+targetTerm.length());
      //for each input lexicon
      for(int i=0;i<numLexicons;i++)
      {
        //does this lexicon contain the term
        //logger.debug("Checking lexicon "+i+" for "+targetTerm+"="+lis[i].getTerm());
        if(hasMore[i] && currentEntries[i].getKey().equals(targetTerm))
        {
          if (targetTermId == -1)
          {  //obtain the termid for this term from the first lexicon that has the term
            nextEntryToWrite = newLexiconEntry(targetTermId = currentEntries[i].getValue().getTermId());
          }
          else if (targetTermId != currentEntries[i].getValue().getTermId())
          {  //check the termids match for this term
            logger.error("Term "+targetTerm+" had two termids ("+targetTermId+","+currentEntries[i].getValue().getTermId()+")");
          }
          //logger.debug("Term "+targetTerm + " found in "+i + "termid="+ lis[i].getTermId());
          nextEntryToWrite.add(currentEntries[i].getValue());
          hasMore[i] = lis[i].hasNext();
         
          if (hasMore[i])
          {
            currentEntries[i] = lis[i].next();
            terms.add(currentEntries[i].getKey());
          }
          else
          {
            currentEntries[i] = null;
          }
          break;
        }
      }
      if (terms.size()>0 && !terms.peek().equals(targetTerm))
      {
        if (targetTermId == -1)
        {
          logger.error("Term "+ targetTerm + " not found in any lexicons");
        }
        //end of this term, so we can write the lexicon entry
        los.writeNextEntry(targetTerm, nextEntryToWrite);
        nextEntryToWrite = null; targetTermId = -1; targetTerm = null;
      }
    }
    if (targetTermId != -1)
      los.writeNextEntry(targetTerm, nextEntryToWrite);
    los.close();
    for(int i=0;i<numLexicons;i++)
    {
      if (lis[i] instanceof Closeable)
        ((Closeable)lis[i]).close();
    }
  }
   

  /** Merge the two lexicon iterators into the given LexiconOutputStream
    * @param lis1 First lexicon to be merged
    * @param lis2 Second lexicon to be merged
    * @param los Lexicon output stream to merge to
    */
  protected void mergeTwoLexicons(
      Iterator<Map.Entry<String,LexiconEntry>> lis1,
      Iterator<Map.Entry<String,LexiconEntry>> lis2,
      LexiconOutputStream<String> los) throws IOException
  {

    //This method merges two sorted term iterators: at each step the
    //lexicographically smaller term is written out, and when both inputs
    //hold the same term their LexiconEntries are combined with add().
    //The caller treats the list of temporary lexicons as a FIFO queue,
    //appending each newly merged lexicon to the back of the queue.
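    //Worked example (frequencies only): merging {a:3, c:1} with {b:2, c:4}
    //writes a:3, then b:2, then c:5, the two entries for c having been
    //combined with LexiconEntry.add().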

    boolean hasMore1 = true;
    boolean hasMore2 = true;
    int termID1 = 0;
    int termID2 = 0;

 
    hasMore1 = lis1.hasNext();
    hasMore2 = lis2.hasNext();
    String sTerm1 = null;
    String sTerm2 = null;
    Map.Entry<String, LexiconEntry> lee1 = null;
    Map.Entry<String, LexiconEntry> lee2 = null;
    if (hasMore1) {
      lee1 = lis1.next();
      termID1 = lee1.getValue().getTermId();
      sTerm1 = lee1.getKey();
    }
    if (hasMore2) {
      lee2 = lis2.next();
      termID2 = lee2.getValue().getTermId();
      sTerm2 = lee2.getKey();
    }
    while (hasMore1 && hasMore2) {
      int compareString = 0;
      if (termID1 != termID2)
      {
        compareString = sTerm1.compareTo(sTerm2);
        if (compareString == 0)//, but termids don't match
        {
          logger.error("Term "+sTerm1+" had two termids ("+
            termID1+","+termID2+")");
        }
      }
     
      if (compareString <0) {
        los.writeNextEntry(sTerm1, lee1.getValue());
        hasMore1 = lis1.hasNext();
        if (hasMore1) {
          lee1 = lis1.next();
          termID1 = lee1.getValue().getTermId();
          sTerm1 = lee1.getKey();
        }
      } else if (compareString >0) {
        los.writeNextEntry(sTerm2, lee2.getValue());
        hasMore2 = lis2.hasNext();
        if (hasMore2) {
          lee2 = lis2.next();
          termID2 = lee2.getValue().getTermId();
          sTerm2 = lee2.getKey();
        }
      } else /*if (compareString == 0)*/ {
        lee1.getValue().add(lee2.getValue());
        los.writeNextEntry(
          sTerm1,
          lee1.getValue()
        );
        hasMore1 = lis1.hasNext();
        hasMore2 = lis2.hasNext();
        if (hasMore1) {
          lee1 = lis1.next();
          termID1 = lee1.getValue().getTermId();
          sTerm1 = lee1.getKey();
        }
        if (hasMore2) {
          lee2 = lis2.next();
          termID2 = lee2.getValue().getTermId();
          sTerm2 = lee2.getKey();
        }
      }
    }
    if (hasMore1) {
      if (lis2 instanceof Closeable) {
        ((Closeable)lis2).close();
      }

      while (hasMore1) {
        los.writeNextEntry(sTerm1, lee1.getValue());
        hasMore1 = lis1.hasNext();
        if (hasMore1) {
          lee1 = lis1.next();
          termID1 = lee1.getValue().getTermId();
          sTerm1 = lee1.getKey();
        }
      }

      //close input stream 1
      if (lis1 instanceof Closeable) {
        ((Closeable)lis1).close();
      }

    } else if (hasMore2) {
      if (lis1 instanceof Closeable) {
        ((Closeable)lis1).close();
      }

      while (hasMore2) {
        los.writeNextEntry(sTerm2, lee2.getValue());
        hasMore2 = lis2.hasNext();
        if (hasMore2) {
          lee2 = lis2.next();
          termID2 = lee2.getValue().getTermId();
          sTerm2 = lee2.getKey();
        }
      }
      //close input file 2 stream
      if (lis2 instanceof Closeable) {
        ((Closeable)lis2).close();
      }
    }
    //close output file streams
    los.close();
  }
 
 
  /** Creates a lexicon index for the specified index
    * @param index Index to make the lexicon index for
    * @deprecated use optimise instead
    */ 
  public static void createLexiconIndex(Index index) throws IOException
  {
    optimise(index, "lexicon");
  }


 
  /** Creates a lexicon hash for the specified index
   * @param index Index to make the lexicon hash for
   * @deprecated use optimise instead
   */
  public static void createLexiconHash(final Index index) throws IOException
  {
    optimise(index, "lexicon");
  }
  /** optimise the lexicon */
  public void optimiseLexicon()
  {
    optimise(index, defaultStructureName);
  }
 
  /** Optimises the lexicon: creates the auxiliary structures (e.g. the lexid file) and records collection statistics as index properties */
  public static void optimise(final Index index, final String structureName)
  {
    try {
      //logger.info("Optimising structure "+structureName);
      CollectionStatisticsCounter counter;
      if (structureName.contains("lexicon"))
      {
        int fieldCount = index.getIntIndexProperty("index.inverted.fields.count", 0);
        if (fieldCount > 0)
        {
          //logger.info(structureName + " has " + fieldCount + " fields");
          counter = new FieldLexiconCollectionStaticticsCounter(index, fieldCount);
        }
        else
        {
          counter = new BasicLexiconCollectionStaticticsCounter(index);
        }
      }
      else
      {
        //other uses of lexicons shouldn't overwrite the token counts in the index
        counter = new NullCollectionStatisticsCounter();
      }
      FSOMapFileLexicon.optimise(structureName, index, counter);
      counter.close();
      index.flush();
    } catch(IOException ioe) {
      logger.error("IOException while creating optimising lexicon called " + structureName, ioe);
    }
  }
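  /* After optimise() the index records the collection statistics gathered by
   * the counters above as index properties: num.Terms, num.Tokens and
   * num.Pointers (plus num.field.<i>.Tokens per field when fields are
   * in use). */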


  /** Return the lexicon input stream for the current index at the specified structure name */
  @SuppressWarnings("unchecked")
  protected Iterator<Map.Entry<String,LexiconEntry>> getLexInputStream(String structureName) throws IOException
  {
    return new FSOMapFileLexicon.MapFileLexiconIterator(structureName, index.getPath(), index.getPrefix(),
        (FixedSizeWriteableFactory<Text>)index.getIndexStructure(defaultStructureName+"-keyfactory"),
        (FixedSizeWriteableFactory<LexiconEntry>)index.getIndexStructure(defaultStructureName+"-valuefactory"));
  }

  /** Return the lexicon output stream for the current index at the specified structure name */
  @SuppressWarnings("unchecked")
  protected LexiconOutputStream<String> getLexOutputStream(String structureName) throws IOException
  {
    return new FSOMapFileLexiconOutputStream(
        index.getPath(), index.getPrefix(),
        structureName,
        (FixedSizeWriteableFactory<Text>)index.getIndexStructure(defaultStructureName+"-keyfactory"));
  }

}